For 3rd Data Science Symposium, at South Dakota State University. Feb 10, 2020

Get ready

#Make sure you have a recent version of R and Rstudio
install.packages("tidyverse")
install.packages(c("plotly", "shiny", "shinyBS"))
install.packages("ggfortify")
install.packages("DataExplorer")
install.packages("GGally")
install.packages(c("VennDiagram","UpSetR") )
install.packages("gganimate")
install.packages("visNetwork")
install.packages("BiocManager")
BiocManager::install("ComplexHeatmap") 
install.packages("kableExtra")

Yes, I can do it!

The state data set

Start a R Notebook

Go to File, New File , R NoteBook.

Data wrangling

First take a look at the state data.

state.x77 <- data.frame(state.x77)   
sta <- cbind(state.abb, state.x77, state.region) #combine the three data sets
colnames(sta)[1] <- "State"                #Rename first column
colnames(sta)[10] <- "Region"              #Rename the 10th column
sta$Pop.Density <- sta$Population/sta$Area
head(sta) 
str(sta)
## 'data.frame':    50 obs. of  11 variables:
##  $ State      : Factor w/ 50 levels "AK","AL","AR",..: 2 1 4 3 5 6 7 8 9 10 ...
##  $ Population : num  3615 365 2212 2110 21198 ...
##  $ Income     : num  3624 6315 4530 3378 5114 ...
##  $ Illiteracy : num  2.1 1.5 1.8 1.9 1.1 0.7 1.1 0.9 1.3 2 ...
##  $ Life.Exp   : num  69 69.3 70.5 70.7 71.7 ...
##  $ Murder     : num  15.1 11.3 7.8 10.1 10.3 6.8 3.1 6.2 10.7 13.9 ...
##  $ HS.Grad    : num  41.3 66.7 58.1 39.9 62.6 63.9 56 54.6 52.6 40.6 ...
##  $ Frost      : num  20 152 15 65 20 166 139 103 11 60 ...
##  $ Area       : num  50708 566432 113417 51945 156361 ...
##  $ Region     : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
##  $ Pop.Density: num  0.071291 0.000644 0.019503 0.04062 0.135571 ...

Scatter plot

Let’s start with this scatterplot. Try to run the first line but without the “+” at the end. Then gradually add one line at a time without the “+” at the end and see the effect. You can select the lines and then press Ctrl+Enter to run. Things are done in steps and layers in ggplot2. That’s where the “gg” comes from ( the Grammar of Graphics).

library(ggplot2)
ggplot(sta, aes(x = Illiteracy, y = Murder)) + 
  geom_point(aes(size = Pop.Density, color = Region)) + 
  geom_smooth(method = 'lm',formula = y ~ x) +  # add regression line
  theme(plot.title = element_text(hjust = 0.5)) +
  ylim(c(0.5, 16)) +
  xlim(c(0, 3)) +
  ggtitle("Murder rate Vs Illiteracy") +
  labs(y="Murders per 100,000", x="Illiteracy rate 1970", caption="Source: R") +
  # guides(color=guide_legend("US Region"), size=guide_legend("Population Density")) +
  guides(colour = guide_legend("US Region", order = 1), # change order
         size = guide_legend("Pop. Density", order = 2)) + 
  theme(legend.position="bottom", legend.box = "horizontal") + 
  annotate("text",  x= 2.5, y = 2, label = paste("R=",round(cor(sta$Murder, sta$Illiteracy), 2)),hjust= 3) +
  geom_text(aes(label=State),hjust=0, vjust=0)

This kind of layout also makes it easy to comment out one line at a time to see the effect. This is done with the first guides verb. Each line starts with a verb that adds or customize something on the plot.

Challenge

Create a scatter plot to visualize the relationship between illiteracy rate and life expectancy. Color data points by region and vary the size of the data points according to income. Note: Start building step by step. Copy the copy from above line by line. Change xlim and ylim. Change axis labels. Change the Pearson’s correlation coefficient, and where it will appear on the plot using x, and y coordinates.

Lollipop plot

ggplot(sta, aes(x = State, y = Population)) +
  geom_point( size=3, color="red", fill=alpha("orange", 0.3), alpha=0.5, shape=20, stroke=2) +
  geom_segment(aes(x = State, xend = State, y = 0, yend = Population)) +
  labs(title = "Lollipop Chart for Population") +
#  coord_flip()  + 
  theme( plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90, vjust = 0.5)) 

Challenge

Try to see what happens by uncommenting the coord_flip line. And then create a lollipop plot for Frost (the number of days).

Density plot with ridgeline

# install.packages("ggridges")
library(ggridges)
ggplot(sta, aes(x = Murder, y = Region, fill = Region)) +
  geom_density_ridges() +
  theme_ridges() + 
  theme(legend.position = "none", 
        plot.title = element_text(hjust = 0.5),
        axis.title.x = element_text(hjust = 0.5), 
        axis.title.y = element_text(hjust = 0.5))
## Picking joint bandwidth of 1.4

Notice that the last line is split into 3 lines, all indended nicely.

Challenge

Use ridgeline plot to explore the regional distribution of Illiteracy for state.x77 and state.region data sets and interpret your figure.

Boxplot with jitter shows the raw data points

library(ggplot2)
ggplot(sta, aes(x = Region, y = Frost)) + 
  geom_boxplot()+
  geom_jitter(position=position_jitter(0.2))

Challenge

Change it to visualize income. Fill the plain boxplots with 4 different colors for the 4 boxes.

st <- sta[, 2:9] #take numeric variables as goal matrix
# install.packages( c("ellipse", "corrplot") )
library(ellipse)
## 
## Attaching package: 'ellipse'
## The following object is masked from 'package:graphics':
## 
##     pairs
library(corrplot)
## corrplot 0.84 loaded
corMatrix <- cor(as.matrix(st)) # Calculate correlation matrix
col <- colorRampPalette(c("red", "yellow", "blue"))   #Color values. Red, yellow and blue represent that the coefficients are -1, 0 and 1 respectively. You can use more than 3 colors to represent the coefficients ranging from -1 to 1.
corrplot.mixed(corMatrix, order = "AOE", lower = "number", lower.col = "black", 
               number.cex = .8, upper = "ellipse",  upper.col = col(10), 
               diag = "u", tl.pos = "lt", tl.col = "black") #Type ?corrplot.mixed in the Console to get help in detail.

plot(hclust(as.dist(1 - cor(as.matrix(st)))))  # hierarchical clustering

Heat maps made easy with ComplexHeatmap

library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.2.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## ========================================
st.matrix <- as.matrix(st) # transfer the data frame to matrix
st.matrix <- apply(st.matrix, 2, function(y)(y - mean(y)) / sd(y))  # standardize data
row_ha = rowAnnotation( Region = sta$Region)
Heatmap(st.matrix, left_annotation = row_ha)

library(ComplexHeatmap)
st.matrix <- as.matrix(st) # transfer the data frame to matrix
st.matrix <- apply(st.matrix, 2, function(y)(y - mean(y)) / sd(y))  # standardize data
row_ha <- rowAnnotation( Region = sta$Region, Murder = anno_barplot(sta$Murder))
col_ha <- HeatmapAnnotation(Random = runif(8), foo = anno_density(st.matrix))
Heatmap(st.matrix, 
        left_annotation = row_ha, 
        top_annotation = col_ha)

Challenge

Add a annotation to the right of the heat map. Using points to represents income for 50 states. Hint: Make another column annotation function and then use right_annotation

Maps are also easy

#install.packages(c("maps","mapproj"))
library(ggplot2)
library(maps)
sta$region <- tolower(state.name)  # Create new character vector with lowercase states names
states <- map_data("state")        # Extract state data
map <- merge(states, sta, by = "region", all.x = T)  # Merge states and state.x77 data
map <- map[order(map$order), ]
p=ggplot(map, aes(x = long, y = lat, group = group)) +  
  geom_polygon(aes(fill = Murder)) +   
  geom_path() + 
  scale_fill_gradientn(colours = rev(heat.colors(10))) +
  coord_map() +
  labs(x = "Longitude", y = "Latitude") +
  guides(fill = guide_legend(title = "Murder Rate")) +  
  theme(plot.title = element_text(hjust = 0.5))

Challenge

Similar to above, create a new code chuck below to use choropleth map to obtain the Illiteracy map in state.x77 data set and give a brief interpretation. Hint: You can combine state.abb to state.x77 or use the row names of state.x77 data set directly.

Interactive plots with plotly

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ComplexHeatmap':
## 
##     add_heatmap
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p <- ggplot(sta, aes(x = Illiteracy, y = Murder)) + 
  geom_point(aes(color = Region)) + 
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Murder rate Vs Illiteracy") +
  labs(y="Murders per 100,000", x="Illiteracy rate 1970", caption="Source: R") +
  annotate("text",  x= 2.5, y = 2, 
           label = paste("R=",round(cor(sta$Murder, sta$Illiteracy), 2))) 

ggplotly(p)

Challenge

Create this interactive plot using the Iris dataset (iris). You can change the hover text using the toolpit option for ggplotly. See this page.

## Warning: Ignoring unknown aesthetics: Petal.Width, Species

Interactive maps

data(canada.cities, package = "maps")
viz <- ggplot(canada.cities, aes(long, lat)) +
  borders(regions = "canada") +
  coord_equal() +
  geom_point(aes(text = name, size = pop), colour = "red", alpha = 1/2)
## Warning: Ignoring unknown aesthetics: text
ggplotly(viz, tooltip = c("text", "size"))

The heart attack dataset

df <- read.table("http://statland.org/R/RC/heartatk4R.txt", 
                         header = TRUE, 
                         sep = "\t", 
                         colClasses = c("character", "factor", "factor", "factor", 
                                        "factor", "numeric", "numeric", "numeric"))
str(df)
## 'data.frame':    12844 obs. of  8 variables:
##  $ Patient  : chr  "1" "2" "3" "4" ...
##  $ DIAGNOSIS: Factor w/ 9 levels "41001","41011",..: 5 5 9 8 9 9 9 9 5 5 ...
##  $ SEX      : Factor w/ 2 levels "F","M": 1 1 1 1 2 2 1 1 2 1 ...
##  $ DRG      : Factor w/ 3 levels "121","122","123": 2 2 2 2 2 1 1 1 1 3 ...
##  $ DIED     : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
##  $ CHARGES  : num  4752 3941 3657 1481 1681 ...
##  $ LOS      : num  10 6 5 2 1 9 15 15 2 1 ...
##  $ AGE      : num  79 34 76 80 55 84 84 70 76 65 ...
ggplot(df, aes(x = AGE, fill = SEX)) + 
       geom_density(alpha = 0.3) + facet_grid(DRG ~.)  # Figure 5.4C

DataExplorer automates E.D.A.

library(DataExplorer)
introduce(df)
plot_intro(df)

plot_missing(df)

plot_qq(df, by="DRG", sampled_rows = 1000L)
## Warning: Removed 49 rows containing non-finite values (stat_qq).
## Warning: Removed 49 rows containing non-finite values (stat_qq_line).

plot_boxplot(df, by = "DIAGNOSIS")
## Warning: Removed 699 rows containing non-finite values (stat_boxplot).

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(sta[, 2:6])

Venn Diagrams and upset plots

table(df$SEX, df$DIED)
##    
##        0    1
##   F 4298  767
##   M 7136  643
table(df$SEX)
## 
##    F    M 
## 5065 7779
table(df$DIED)
## 
##     0     1 
## 11434  1410
library(VennDiagram)
## Loading required package: futile.logger
## 
## Attaching package: 'VennDiagram'
## The following object is masked from 'package:ellipse':
## 
##     ellipse
grid.newpage()
draw.pairwise.venn( nrow(subset(df, SEX == "F")), 
                    nrow(subset(df, DIED == "1")), 
                    nrow(subset(df, SEX == "F" & DIED == "1")), 
                    category = c("Women", "Died"), 
                    lty = rep("blank",2), 
                    fill = c("light blue", "pink"), 
                    alpha = rep(0.5, 2), 
                    cat.pos = c(0,  0), 
                    cat.dist = rep(0.025, 2), 
                    scaled = FALSE)

## (polygon[GRID.polygon.3494], polygon[GRID.polygon.3495], polygon[GRID.polygon.3496], polygon[GRID.polygon.3497], text[GRID.text.3498], text[GRID.text.3499], text[GRID.text.3500], text[GRID.text.3501], text[GRID.text.3502])
movies = read.csv(system.file("extdata", "movies.csv", package = "UpSetR"), 
    header = TRUE, sep = ";")
head(movies)
m = make_comb_mat(movies, top_n_sets = 6, remove_complement_set = TRUE)
UpSet(m)

visNetwork

library(visNetwork)
nodes <- data.frame(id = 1:7)

edges <- data.frame(
  from = c(1,2,2,2,3,3),
  to = c(2,3,4,5,6,7)
)
visNetwork(nodes, edges, width = "100%") %>% 
  visEdges(arrows = "from")